{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to DataFrames\n",
    "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using DataFrames\n",
    "using BenchmarkTools"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Performance tips"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Access by column number is faster than by name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  40.121 ns (0 allocations: 0 bytes)\n",
      "  61.510 ns (0 allocations: 0 bytes)\n"
     ]
    }
   ],
   "source": [
    "x = DataFrame(rand(5, 1000))\n",
    "@btime x[500];\n",
    "@btime x[:x500];"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### When working with data `DataFrame` use barrier functions or type annotation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  139.108 ms (5999031 allocations: 122.06 MiB)\n"
     ]
    }
   ],
   "source": [
    "function f_bad() # this function will be slow\n",
    "    srand(1); x = DataFrame(rand(1000000,2))\n",
    "    y, z = x[1], x[2]\n",
    "    p = 0.0\n",
    "    for i in 1:nrow(x)\n",
    "        p += y[i]*z[i]\n",
    "    end\n",
    "    p\n",
    "end\n",
    "\n",
    "@btime f_bad();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Variables:\n",
      "  #self# <optimized out>\n",
      "  i::Int64\n",
      "  #temp#@_3::Int64\n",
      "  x::DataFrames.DataFrame\n",
      "  y <optimized out>\n",
      "  z <optimized out>\n",
      "  p\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n",
      "  selected_column@_8 <optimized out>\n",
      "  selected_column@_9 <optimized out>\n",
      "  #temp#@_10::Int64\n",
      "\n",
      "Body:\n",
      "  begin \n",
      "      $(Expr(:invoke, MethodInstance for srand(::Int64), :(Main.srand), 1)) # line 2:\n",
      "      $(Expr(:inbounds, false))\n",
      "      # meta: location random.jl rand 285\n",
      "      SSAValue(9) = 1000000\n",
      "      SSAValue(10) = 2\n",
      "      # meta: location random.jl rand 284\n",
      "      # meta: location random.jl rand 387\n",
      "      # meta: location random.jl rand 390\n",
      "      SSAValue(8) = $(Expr(:foreigncall, :(:jl_alloc_array_2d), Array{Float64,2}, svec(Any, Int64, Int64), Array{Float64,2}, 0, SSAValue(9), 0, SSAValue(10), 0))\n",
      "      # meta: pop location\n",
      "      # meta: pop location\n",
      "      # meta: pop location\n",
      "      # meta: pop location\n",
      "      $(Expr(:inbounds, :pop))\n",
      "      SSAValue(11) = $(Expr(:invoke, MethodInstance for rand!(::MersenneTwister, ::Array{Float64,2}, ::Int64, ::Type{Base.Random.CloseOpen}), :(Base.Random.rand!), :(Base.Random.GLOBAL_RNG), SSAValue(8), :((Base.arraylen)(SSAValue(8))::Int64), :(Base.Random.CloseOpen)))\n",
      "      $(Expr(:inbounds, false))\n",
      "      # meta: location D:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe/dataframe.jl Type 161\n",
      "      SSAValue(12) = $(Expr(:invoke, MethodInstance for gennames(::Int64), :(DataFrames.gennames), :((Base.arraysize)(SSAValue(11), 2)::Int64)))\n",
      "      # meta: pop location\n",
      "      $(Expr(:inbounds, :pop))\n",
      "      x::DataFrames.DataFrame = $(Expr(:invoke, MethodInstance for #DataFrame#60(::Bool, ::Type{T} where T, ::Array{Float64,2}, ::Array{Symbol,1}), :(DataFrames.#DataFrame#60), false, :(Main.DataFrame), SSAValue(11), SSAValue(12))) # line 3:\n",
      "      SSAValue(0) = (Base.arrayref)((Core.getfield)(x::DataFrames.DataFrame, :columns)\u001b[1m\u001b[91m::Array{T,1} where T\u001b[39m\u001b[22m, 1)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n",
      "      SSAValue(1) = (Base.arrayref)((Core.getfield)(x::DataFrames.DataFrame, :columns)\u001b[1m\u001b[91m::Array{T,1} where T\u001b[39m\u001b[22m, 2)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m # line 4:\n",
      "      p\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m = 0.0 # line 5:\n",
      "      $(Expr(:inbounds, false))\n",
      "      # meta: location D:\\Software\\JULIA_PKG\\v0.6\\DataFrames\\src\\dataframe/dataframe.jl nrow 228\n",
      "      unless (Base.slt_int)(0, (Base.arraylen)((Core.getfield)((Core.getfield)(x::DataFrames.DataFrame, :colindex)::DataFrames.Index, :names)::Array{Symbol,1})::Int64)::Bool goto 34\n",
      "      #temp#@_10::Int64 = (Core.typeassert)((DataFrames.length)((Base.arrayref)((Core.getfield)(x::DataFrames.DataFrame, :columns)\u001b[1m\u001b[91m::Array{T,1} where T\u001b[39m\u001b[22m, 1)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m, DataFrames.Int)::Int64\n",
      "      goto 36\n",
      "      34: \n",
      "      #temp#@_10::Int64 = 0\n",
      "      36: \n",
      "      # meta: pop location\n",
      "      $(Expr(:inbounds, :pop))\n",
      "      SSAValue(13) = (Base.select_value)((Base.sle_int)(1, #temp#@_10::Int64)::Bool, #temp#@_10::Int64, (Base.sub_int)(1, 1)::Int64)::Int64\n",
      "      #temp#@_3::Int64 = 1\n",
      "      41: \n",
      "      unless (Base.not_int)((#temp#@_3::Int64 === (Base.add_int)(SSAValue(13), 1)::Int64)::Bool)::Bool goto 51\n",
      "      SSAValue(14) = #temp#@_3::Int64\n",
      "      SSAValue(15) = (Base.add_int)(#temp#@_3::Int64, 1)::Int64\n",
      "      i::Int64 = SSAValue(14)\n",
      "      #temp#@_3::Int64 = SSAValue(15) # line 6:\n",
      "      p\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m = (p\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m + ((Main.getindex)(SSAValue(0), i::Int64)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m * (Main.getindex)(SSAValue(1), i::Int64)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m)\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n",
      "      49: \n",
      "      goto 41\n",
      "      51:  # line 8:\n",
      "      return p\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n",
      "  end\u001b[1m\u001b[91m::Any\u001b[39m\u001b[22m\n"
     ]
    }
   ],
   "source": [
    "@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  17.597 ms (53 allocations: 30.52 MiB)\n",
      "  21.162 ms (53 allocations: 30.52 MiB)\n"
     ]
    }
   ],
   "source": [
    "# solution 1 is to use barrier function (it should be possible to use it in almost any code)\n",
    "function f_inner(y,z)\n",
    "   p = 0.0\n",
    "   for i in 1:length(y)\n",
    "       p += y[i]*z[i]\n",
    "   end\n",
    "   p\n",
    "end\n",
    "\n",
    "function f_barrier() # extract the work to an inner function\n",
    "    srand(1); x = DataFrame(rand(1000000,2))\n",
    "    f_inner(x[1], x[2])\n",
    "end\n",
    "\n",
    "function f_inbuilt() # or use inbuilt function if possible\n",
    "    srand(1); x = DataFrame(rand(1000000,2))\n",
    "    dot(x[1], x[2])\n",
    "end\n",
    "\n",
    "@btime f_barrier();\n",
    "@btime f_inbuilt();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  23.551 ms (53 allocations: 30.52 MiB)\n"
     ]
    }
   ],
   "source": [
    "# solution 2 is to provide the types of extracted columns\n",
    "# it is simpler but there are cases in which you will not know these types\n",
    "function f_typed()\n",
    "    srand(1); x = DataFrame(rand(1000000,2))\n",
    "    y::Vector{Float64}, z::Vector{Float64} = x[1], x[2]\n",
    "    p = 0.0\n",
    "    for i in 1:nrow(x)\n",
    "        p += y[i]*z[i]\n",
    "    end\n",
    "    p\n",
    "end\n",
    "\n",
    "@btime f_typed();"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Consider using delayed `DataFrame` creation technique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  37.652 ms (1950238 allocations: 37.43 MiB)\n",
      "  4.126 ms (1036 allocations: 7.69 MiB)\n"
     ]
    }
   ],
   "source": [
    "function f1()\n",
    "    x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly\n",
    "    for c in 1:ncol(x)\n",
    "        d = x[c]\n",
    "        for r in 1:nrow(x)\n",
    "            d[r] = rand()\n",
    "        end\n",
    "    end\n",
    "    x\n",
    "end\n",
    "\n",
    "function f2()\n",
    "    x = Vector{Any}(100)\n",
    "    for c in 1:length(x)\n",
    "        d = Vector{Float64}(10^4)\n",
    "        for r in 1:length(d)\n",
    "            d[r] = rand()\n",
    "        end\n",
    "        x[c] = d\n",
    "    end\n",
    "    DataFrame(x) # we delay creation of DataFrame after we have our job done\n",
    "end\n",
    "\n",
    "@btime f1();\n",
    "@btime f2();"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### You can add rows to a `DataFrame` in place and it is fast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  17.043 ms (141 allocations: 38.15 MiB)\n",
      "  23.793 μs (12 allocations: 576 bytes)\n",
      "  329.680 ns (5 allocations: 80 bytes)\n"
     ]
    }
   ],
   "source": [
    "x = DataFrame(rand(10^6, 5))\n",
    "y = DataFrame(transpose(1.0:5.0))\n",
    "z = [1.0:5.0;]\n",
    "\n",
    "@btime vcat($x, $y); # creates a new DataFrame - slow\n",
    "@btime append!($x, $y); # in place - fast\n",
    "\n",
    "x = DataFrame(rand(10^6, 5)) # reset to the same starting point\n",
    "@btime push!($x, $z); # add a single row in place - fastest"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Allowing `missing` as well as `categorical` slows down computations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Int64\n",
      " raw:\n",
      "  6.739 ms (15 allocations: 7.63 MiB)\n",
      " categorical:\n",
      "  36.355 ms (5 allocations: 704 bytes)\n",
      "String\n",
      " raw:\n",
      "  30.456 ms (5 allocations: 704 bytes)\n",
      " categorical:\n",
      "  43.805 ms (5 allocations: 704 bytes)\n",
      "Union{Int64, Missings.Missing}\n",
      " raw:\n",
      "  41.849 ms (5 allocations: 704 bytes)\n",
      " categorical:\n",
      "  53.116 ms (5 allocations: 704 bytes)\n",
      "Union{Missings.Missing, String}\n",
      " raw:\n",
      "  63.875 ms (5 allocations: 704 bytes)\n",
      " categorical:\n",
      "  86.547 ms (5 allocations: 704 bytes)\n"
     ]
    }
   ],
   "source": [
    "using StatsBase\n",
    "\n",
    "function test(data) # uses countmap function to test performance\n",
    "    println(eltype(data))\n",
    "    x = rand(data, 10^6)\n",
    "    y = categorical(x)\n",
    "    println(\" raw:\")\n",
    "    @btime countmap($x)\n",
    "    println(\" categorical:\")\n",
    "    @btime countmap($y)\n",
    "    nothing\n",
    "end\n",
    "\n",
    "test(1:10)\n",
    "test([randstring() for i in 1:10])\n",
    "test(allowmissing(1:10))\n",
    "test(allowmissing([randstring() for i in 1:10]))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
